; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes='sroa<preserve-cfg>' -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
; RUN: opt -passes='sroa<modify-cfg>' -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG

; Named aggregate types used as alloca types by the tests below.
;   struct.a - wraps a single <8 x half> vector; nested inside struct.b/c/d
;              and array.b.
;   struct.b - used by @test_zeroinit and @test_memset.
;   struct.c - struct.a followed by scalar members; used by
;              @test_struct_contain_multiple_types1.
;   struct.d - [4 x i32] followed by struct.a; used by
;              @test_struct_contain_multiple_types2, which addresses the
;              struct.a member at byte offset 16.
;   struct.e - array of two <8 x half> plus two i32; used by
;              @test_struct_array_vector.
;   struct.f - same shape as struct.e but with <8 x i16> vectors; used by
;              @test_struct_array_vector_i16.
;   array.a  - used by @test_array_vector and
;              @test_array_vector_no_vector_common_type.
;   array.b  - used by @test_array_vector2.
%"struct.a" = type { <8 x half> }
%"struct.b" = type { %"struct.a" }
%"struct.c" = type { %"struct.a", i32, i8 }
%"struct.d" = type { [4 x i32], %"struct.a" }
%"struct.e" = type { [2 x <8 x half>], i32, i32 }
%"struct.f" = type { [2 x <8 x i16>], i32, i32 }
%"array.a" = type [2 x <8 x half>]
%"array.b" = type [2 x <8 x half>]

; Alloca of struct.b that is zero-initialized with an <8 x half> store, then
; overwritten with a <4 x float> store, and finally read back as individual
; half values at byte offsets 0, 2 and 4.
; Expected result (see the assertions below): the alloca is removed, the
; loaded <4 x float> is bitcast to <8 x i16>, and each half load becomes an
; extractelement (indices 0, 1, 2) followed by a bitcast from i16 to half.
define amdgpu_kernel void @test_zeroinit() #0 {
; CHECK-LABEL: @test_zeroinit(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"struct.b", align 16
  ; Zero-initialize the whole 16-byte slot with the struct's own vector type.
  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
  ; NOTE(review): legacy typed-pointer spelling; the expected output prints
  ; this operand as `ptr undef`.
  %data = load <4 x float>, <4 x float>* undef
  ; Overwrite the same 16 bytes using a different vector type.
  store <4 x float> %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  ; Scalar half reads at byte offsets 0, 2 and 4 of the alloca.
  ; NOTE(review): the offset-2 and offset-4 loads still claim align 16, which
  ; cannot hold for a 16-aligned base - presumably a test-reduction artifact.
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

; Same access pattern as @test_zeroinit, except that the initial zeroing is
; done with a 16-byte llvm.memset (i64 length variant) instead of a vector
; store. The expected output is identical in shape: no alloca remains and the
; half loads become extractelement + bitcast on the <8 x i16> view of the
; loaded <4 x float>.
define amdgpu_kernel void @test_memset() #0 {
; CHECK-LABEL: @test_memset(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"struct.b", align 16
  ; Zero the 16-byte slot via memset rather than a typed store.
  call void @llvm.memset.p0.i64(ptr align 16 %b_blockwise_copy, i8 0, i64 16, i1 false)
  %data = load <4 x float>, <4 x float>* undef
  ; The memset is fully overwritten by this store.
  store <4 x float> %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  ; Scalar half reads at byte offsets 0, 2 and 4 of the alloca.
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

; Variant of @test_zeroinit where the alloca type is a bare <8 x half> vector
; rather than a struct wrapping one. The stores and loads are otherwise the
; same, and the expected output has the same shape (alloca removed, half
; loads rewritten to extractelement on <8 x i16> plus a bitcast to half).
define amdgpu_kernel void @vector_type_alloca() #0 {
; CHECK-LABEL: @vector_type_alloca(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  ; The alloca itself is a vector type here (no enclosing struct).
  %b_blockwise_copy = alloca <8 x half>, align 16
  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
  %data = load <4 x float>, <4 x float>* undef
  ; Overwrite the zero-initialized slot with a differently typed vector.
  store <4 x float> %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  ; Scalar half reads at byte offsets 0, 2 and 4 of the alloca.
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

; Alloca of struct.c, whose leading struct.a member is followed by i32 and i8
; members. Only the leading 16 bytes are written (zero store, then a
; <4 x float> store) and read (half loads at offsets 0, 2, 4); the trailing
; scalar members are never accessed.
; Expected output: no alloca remains and the half loads are rewritten to
; extractelement on the <8 x i16> view of the loaded data plus a bitcast.
define amdgpu_kernel void @test_struct_contain_multiple_types1() #0 {
; CHECK-LABEL: @test_struct_contain_multiple_types1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"struct.c", align 16
  ; Only the first member (the <8 x half> inside struct.a) is initialized.
  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
  %data = load <4 x float>, <4 x float>* undef
  store <4 x float> %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  ; Scalar half reads at byte offsets 0, 2 and 4 of the alloca.
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

; Alloca of struct.d ([4 x i32] followed by struct.a). Bytes 0..15 are
; memset to zero and then overwritten by a [4 x i32] aggregate store; bytes
; 16..31 are zero-stored as <8 x half>, overwritten with a <4 x float>, and
; then read as half values at byte offsets 16, 18 and 20.
; Expected output: no alloca remains. The aggregate load is decomposed into
; four extractvalue instructions, and the half loads become extractelement
; (indices 0, 1, 2) on the <8 x i16> view of %data2 plus a bitcast to half.
define amdgpu_kernel void @test_struct_contain_multiple_types2() #0 {
; CHECK-LABEL: @test_struct_contain_multiple_types2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA1:%.*]] = load [4 x i32], ptr undef, align 4
; CHECK-NEXT:    [[DATA1_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 0
; CHECK-NEXT:    [[DATA1_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 1
; CHECK-NEXT:    [[DATA1_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 2
; CHECK-NEXT:    [[DATA1_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 3
; CHECK-NEXT:    [[DATA2:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA2]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_16_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_5_16_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_18_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_5_18_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_20_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_5_20_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"struct.d", align 16
  ; Zero only the first 16 bytes (the [4 x i32] member).
  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 16, i1 false)
  %data1 = load [4 x i32], [4 x i32]* undef
  ; First-class aggregate store covering the [4 x i32] member.
  store [4 x i32] %data1, ptr %b_blockwise_copy, align 16
  ; Byte offset 16: start of the struct.a member.
  %data2_gep = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
  store <8 x half> zeroinitializer, ptr %data2_gep, align 16
  %data2 = load <4 x float>, <4 x float>* undef
  store <4 x float> %data2, ptr %data2_gep, align 16
  br label %bb

bb:
  ; Only the vector member is read back; the [4 x i32] member is never loaded.
  %ptr1 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
  %load1 = load half, ptr %ptr1, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 18
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 20
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

; Alloca of struct.e (array of two <8 x half> followed by two i32). Both
; vector elements (byte offsets 0 and 16) are zero-stored as <8 x half> and
; then overwritten with <4 x float> values; one half is read from the start
; of each element.
; Expected output: no alloca remains, and each half load becomes an
; extractelement at index 0 of the corresponding <8 x i16> bitcast followed
; by a bitcast to half.
define amdgpu_kernel void @test_struct_array_vector() #0 {
; CHECK-LABEL: @test_struct_array_vector(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA0:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA0]] to <8 x i16>
; CHECK-NEXT:    [[DATA1:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[DATA1]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_3_16_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_3_16_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"struct.e", align 16
  ; Zero-initialize both vector elements of the leading array member.
  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
  %0 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
  store <8 x half> zeroinitializer, ptr %0, align 16
  ; Overwrite each element with a <4 x float> value.
  %data0 = load <4 x float>, <4 x float>* undef
  store <4 x float> %data0, ptr %b_blockwise_copy, align 16
  %data1 = load <4 x float>, <4 x float>* undef
  store <4 x float> %data1, ptr %0, align 16
  br label %bb

bb:
  ; Read the first half of each 16-byte element (byte offsets 0 and 16).
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
  %load2 = load half, ptr %ptr2, align 16
  ret void
}

; Integer counterpart of @test_struct_array_vector: alloca of struct.f (array
; of two <8 x i16> followed by two i32). The first 32 bytes are memset to
; zero, both 16-byte elements are overwritten with <4 x i32> values, and i16
; values are read at byte offsets 0, 2 and 16.
; Expected output: no alloca remains; each i16 load becomes a plain
; extractelement on the <8 x i16> bitcast of the stored data, with no extra
; scalar bitcast because the loaded type already matches the element type.
define amdgpu_kernel void @test_struct_array_vector_i16() #0 {
; CHECK-LABEL: @test_struct_array_vector_i16(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x i32>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[DATA]] to <8 x i16>
; CHECK-NEXT:    [[DATA2:%.*]] = load <4 x i32>, ptr undef, align 16
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[DATA2]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_16_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"struct.f", align 16
  ; Zero the first 32 bytes (both vector elements) in one call.
  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
  %data = load <4 x i32>, <4 x i32>* undef
  store <4 x i32> %data, ptr %b_blockwise_copy, align 16
  %data2 = load <4 x i32>, <4 x i32>* undef
  ; Byte offset 16: second vector element of the array member.
  %data2_gep = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
  store <4 x i32> %data2, ptr %data2_gep, align 16
  br label %bb

bb:
  ; i16 reads at byte offsets 0 and 2 (first element) and 16 (second element).
  %load1 = load i16, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load i16, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
  %load3 = load i16, ptr %ptr3, align 16
  ret void
}

; Alloca of a plain [8 x half] array (no vector type involved). The 16 bytes
; are memset to zero, overwritten by a [4 x float] aggregate store, and then
; read as half values at byte offsets 0, 2 and 4.
; Expected output: unlike the vector-typed tests above, the alloca is NOT
; fully promoted. It is split into two float-sized allocas (for bytes 0..3
; and 4..7), each with its own 4-byte memset and float store, and the half
; loads remain as memory loads from those allocas. Aggregate elements 2 and 3
; are extracted but not stored, since nothing reads bytes 8..15.
; The remaining loads carry recomputed alignments (16, 2, 4) rather than the
; align 16 written on every input load.
define amdgpu_kernel void @test_half_array() #0 {
; CHECK-LABEL: @test_half_array(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float undef to i32
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float undef to i32
; CHECK-NEXT:    [[DATA:%.*]] = load [4 x float], ptr undef, align 4
; CHECK-NEXT:    [[DATA_FCA_0_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 0
; CHECK-NEXT:    store float [[DATA_FCA_0_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
; CHECK-NEXT:    [[DATA_FCA_1_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 1
; CHECK-NEXT:    store float [[DATA_FCA_1_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
; CHECK-NEXT:    [[DATA_FCA_2_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 2
; CHECK-NEXT:    [[DATA_FCA_3_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 3
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
; CHECK-NEXT:    ret void
;
entry:
  ; Array-of-scalars alloca: there is no vector type among the alloca type
  ; or the stored values in this test.
  %b_blockwise_copy = alloca [8 x half], align 16
  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 16, i1 false)
  %data = load [4 x float], [4 x float]* undef
  ; First-class aggregate store covering all 16 bytes.
  store [4 x float] %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  ; Scalar half reads at byte offsets 0, 2 and 4 of the alloca.
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

; Alloca of array.a ([2 x <8 x half>], 32 bytes). All 32 bytes are memset to
; zero, but only the first 16 bytes are overwritten (with a <4 x float>) and
; read back (half loads at byte offsets 0, 2, 4).
; Expected output: the first element is fully promoted (half loads become
; extractelement on the <8 x i16> bitcast plus a bitcast to half), while the
; second element survives as a separate <8 x half> alloca that only receives
; its 16-byte share of the memset.
define amdgpu_kernel void @test_array_vector() #0 {
; CHECK-LABEL: @test_array_vector(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"array.a", align 16
  ; Zero both array elements (32 bytes).
  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
  %data = load <4 x float>, <4 x float>* undef
  ; Only the first element is overwritten.
  store <4 x float> %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  ; All reads fall inside the first 16-byte element.
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

; Same access pattern as @test_array_vector, but the alloca type is array.b
; ([2 x struct.a]), i.e. each array element wraps its <8 x half> in a struct.
; Expected output has the same shape: the first 16 bytes are promoted to
; extractelement + bitcast, and the second 16 bytes remain as an <8 x half>
; alloca that is only memset.
define amdgpu_kernel void @test_array_vector2() #0 {
; CHECK-LABEL: @test_array_vector2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"array.b", align 16
  ; Zero both array elements (32 bytes).
  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
  %data = load <4 x float>, <4 x float>* undef
  ; Only the first element is overwritten.
  store <4 x float> %data, ptr %b_blockwise_copy, align 16
  br label %bb

bb:
  ; All reads fall inside the first 16-byte element.
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  ret void
}

; Negative case for the vector tests above. The alloca type is array.a
; ([2 x <8 x half>]), but the first 16 bytes are only ever written with four
; scalar float stores (byte offsets 0, 4, 8, 12) and read with eight scalar
; half loads (byte offsets 0..14 in steps of 2); no vector-typed load or
; store touches the alloca.
; Expected output: nothing is promoted to SSA values. The first 16 bytes are
; split into four float allocas and the untouched second 16 bytes become an
; <8 x half> alloca; the memset is split accordingly, and the half loads stay
; as memory loads from the float allocas.
; NOTE(review): the input stores/loads at offsets that are not multiples of
; 16 all claim align 16; the expected output shows recomputed alignments
; (e.g. 4, 8, 2) on the rewritten accesses.
define amdgpu_kernel void @test_array_vector_no_vector_common_type() #0 {
; CHECK-LABEL: @test_array_vector_no_vector_common_type(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7:%.*]] = alloca float, align 8
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10:%.*]] = alloca float, align 4
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_13:%.*]] = alloca <8 x half>, align 16
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 8 [[B_BLOCKWISE_COPY_SROA_7]], i8 0, i32 4, i1 false)
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_10]], i8 0, i32 4, i1 false)
; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_13]], i8 0, i32 16, i1 false)
; CHECK-NEXT:    [[DATA1:%.*]] = load float, ptr undef, align 4
; CHECK-NEXT:    [[DATA2:%.*]] = load float, ptr undef, align 4
; CHECK-NEXT:    [[DATA3:%.*]] = load float, ptr undef, align 4
; CHECK-NEXT:    [[DATA4:%.*]] = load float, ptr undef, align 4
; CHECK-NEXT:    store float [[DATA1]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
; CHECK-NEXT:    store float [[DATA2]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
; CHECK-NEXT:    store float [[DATA3]], ptr [[B_BLOCKWISE_COPY_SROA_7]], align 8
; CHECK-NEXT:    store float [[DATA4]], ptr [[B_BLOCKWISE_COPY_SROA_10]], align 4
; CHECK-NEXT:    br label [[BB:%.*]]
; CHECK:       bb:
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_2_PTR4_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_4]], i64 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_2_B_BLOCKWISE_COPY_SROA_4_6_LOAD4:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4_2_PTR4_SROA_IDX]], align 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7_0_B_BLOCKWISE_COPY_SROA_7_8_LOAD5:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_7]], align 8
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7_2_PTR6_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_7]], i64 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7_2_B_BLOCKWISE_COPY_SROA_7_10_LOAD6:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_7_2_PTR6_SROA_IDX]], align 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10_0_B_BLOCKWISE_COPY_SROA_10_12_LOAD7:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_10]], align 4
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10_2_PTR8_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_10]], i64 2
; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10_2_B_BLOCKWISE_COPY_SROA_10_14_LOAD8:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_10_2_PTR8_SROA_IDX]], align 2
; CHECK-NEXT:    ret void
;
entry:
  %b_blockwise_copy = alloca %"array.a", align 16
  ; Zero both array elements (32 bytes).
  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
  %data1 = load float, float* undef
  %data2 = load float, float* undef
  %data3 = load float, float* undef
  %data4 = load float, float* undef
  ; Four scalar float stores covering bytes 0..15 (the first array element).
  store float %data1, ptr %b_blockwise_copy, align 16
  %data_ptr1 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  store float %data2, ptr %data_ptr1, align 16
  %data_ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 8
  store float %data3, ptr %data_ptr2, align 16
  %data_ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 12
  store float %data4, ptr %data_ptr3, align 16
  br label %bb

bb:
  ; Eight scalar half reads covering bytes 0..15, two per stored float.
  %load1 = load half, ptr %b_blockwise_copy, align 16
  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
  %load2 = load half, ptr %ptr2, align 16
  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
  %load3 = load half, ptr %ptr3, align 16
  %ptr4 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 6
  %load4 = load half, ptr %ptr4, align 16
  %ptr5 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 8
  %load5 = load half, ptr %ptr5, align 16
  %ptr6 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 10
  %load6 = load half, ptr %ptr6, align 16
  %ptr7 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 12
  %load7 = load half, ptr %ptr7, align 16
  %ptr8 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 14
  %load8 = load half, ptr %ptr8, align 16
  ret void
}

; Intrinsic declarations and the attribute group shared by every kernel above.
; NOTE(review): @llvm.amdgcn.raw.buffer.load.v4f32 is declared but not called
; by any function visible in this file - presumably left over from the
; original reproducer; confirm before removing.
declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
; The i64-length memset is used by @test_memset; the i32-length variant is
; used by the remaining memset-based tests.
declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) nounwind
declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1) nounwind
; Attribute group #0 is attached to every kernel definition and to the
; buffer-load declaration.
attributes #0 = { nounwind readonly }

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK-MODIFY-CFG: {{.*}}
; CHECK-PRESERVE-CFG: {{.*}}
